/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    RandomSubSpace.java
 *    Copyright (C) 2006-2012 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.classifiers.meta;

import java.util.Arrays;
import java.util.Collections;
import java.util.Enumeration;
import java.util.Random;
import java.util.Vector;

import weka.classifiers.Classifier;
import weka.classifiers.RandomizableParallelIteratedSingleClassifierEnhancer;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
import weka.core.WeightedInstancesHandler;
import weka.filters.unsupervised.attribute.Remove;

/**
 * <!-- globalinfo-start --> This method constructs a decision tree based
 * classifier that maintains highest accuracy on training data and improves on
 * generalization accuracy as it grows in complexity. The classifier consists of
 * multiple trees constructed systematically by pseudorandomly selecting subsets
 * of components of the feature vector, that is, trees constructed in randomly
 * chosen subspaces.<br/>
 * <br/>
 * For more information, see<br/>
 * <br/>
 * Tin Kam Ho (1998). The Random Subspace Method for Constructing Decision
 * Forests. IEEE Transactions on Pattern Analysis and Machine Intelligence.
 * 20(8):832-844. URL http://citeseer.ist.psu.edu/ho98random.html.
 * <p/>
 * <!-- globalinfo-end -->
 *
 * <!-- technical-bibtex-start --> BibTeX:
 * 
 * <pre>
 * &#64;article{Ho1998,
 *    author = {Tin Kam Ho},
 *    journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
 *    number = {8},
 *    pages = {832-844},
 *    title = {The Random Subspace Method for Constructing Decision Forests},
 *    volume = {20},
 *    year = {1998},
 *    ISSN = {0162-8828},
 *    URL = {http://citeseer.ist.psu.edu/ho98random.html}
 * }
 * </pre>
 * <p/>
 * <!-- technical-bibtex-end -->
 *
 * <!-- options-start --> Valid options are:
 * <p/>
 * 
 * <pre>
 *  -P
 *  Size of each subspace:
 *   &lt; 1: percentage of the number of attributes
 *   &gt;=1: absolute number of attributes
 * </pre>
 * 
 * <pre>
 *  -S &lt;num&gt;
 *  Random number seed.
 *  (default 1)
 * </pre>
 * 
 * <pre>
 *  -I &lt;num&gt;
 *  Number of iterations.
 *  (default 10)
 * </pre>
 * 
 * <pre>
 *  -D
 *  If set, classifier is run in debug mode and
 *  may output additional info to the console
 * </pre>
 * 
 * <pre>
 *  -W
 *  Full name of base classifier.
 *  (default: weka.classifiers.trees.REPTree)
 * </pre>
 * 
 * <pre>
 *  
 * Options specific to classifier weka.classifiers.trees.REPTree:
 * </pre>
 * 
 * <pre>
 *  -M &lt;minimum number of instances&gt;
 *  Set minimum number of instances per leaf (default 2).
 * </pre>
 * 
 * <pre>
 *  -V &lt;minimum variance for split&gt;
 *  Set minimum numeric class variance proportion
 *  of train variance for split (default 1e-3).
 * </pre>
 * 
 * <pre>
 *  -N &lt;number of folds&gt;
 *  Number of folds for reduced error pruning (default 3).
 * </pre>
 * 
 * <pre>
 *  -S &lt;seed&gt;
 *  Seed for random data shuffling (default 1).
 * </pre>
 * 
 * <pre>
 *  -P
 *  No pruning.
 * </pre>
 * 
 * <pre>
 *  -L
 *  Maximum tree depth (default -1, no maximum)
 * </pre>
 * 
 * <!-- options-end -->
 *
 * Options after -- are passed to the designated classifier.
 * <p>
 *
 * @author Bernhard Pfahringer (bernhard@cs.waikato.ac.nz)
 * @author Peter Reutemann (fracpete@cs.waikato.ac.nz)
 * @version $Revision$
 */
public class RandomSubSpace extends RandomizableParallelIteratedSingleClassifierEnhancer implements WeightedInstancesHandler, TechnicalInformationHandler {

    /** for serialization */
    private static final long serialVersionUID = 1278172513912424947L;

    /** The size of each bag sample, as a percentage of the training size */
    protected double m_SubSpaceSize = 0.5;

    /** a ZeroR model in case no model can be built from the data */
    protected Classifier m_ZeroR;

    /** Training data */
    protected Instances m_data;

    /**
     * Constructor.
     */
    public RandomSubSpace() {
        super();

        m_Classifier = new weka.classifiers.trees.REPTree();
    }

    /**
     * Returns a string describing classifier
     * 
     * @return a description suitable for displaying in the explorer/experimenter
     *         gui
     */
    public String globalInfo() {
        return "This method constructs a decision tree based classifier that " + "maintains highest accuracy on training data and improves on " + "generalization accuracy as it grows in complexity. The classifier " + "consists of multiple trees constructed systematically by " + "pseudorandomly selecting subsets of components of the feature vector, " + "that is, trees constructed in randomly chosen subspaces.\n\n" + "For more information, see\n\n" + getTechnicalInformation().toString();
    }

    /**
     * Returns an instance of a TechnicalInformation object, containing detailed
     * information about the technical background of this class, e.g., paper
     * reference or book this class is based on.
     * 
     * @return the technical information about this class
     */
    public TechnicalInformation getTechnicalInformation() {
        TechnicalInformation result;

        result = new TechnicalInformation(Type.ARTICLE);
        result.setValue(Field.AUTHOR, "Tin Kam Ho");
        result.setValue(Field.YEAR, "1998");
        result.setValue(Field.TITLE, "The Random Subspace Method for Constructing Decision Forests");
        result.setValue(Field.JOURNAL, "IEEE Transactions on Pattern Analysis and Machine Intelligence");
        result.setValue(Field.VOLUME, "20");
        result.setValue(Field.NUMBER, "8");
        result.setValue(Field.PAGES, "832-844");
        result.setValue(Field.URL, "http://citeseer.ist.psu.edu/ho98random.html");
        result.setValue(Field.ISSN, "0162-8828");

        return result;
    }

    /**
     * String describing default classifier.
     * 
     * @return the default classifier classname
     */
    protected String defaultClassifierString() {
        return "weka.classifiers.trees.REPTree";
    }

    /**
     * Returns an enumeration describing the available options.
     *
     * @return an enumeration of all the available options.
     */
    public Enumeration<Option> listOptions() {
        Vector<Option> result = new Vector<Option>();

        result.addElement(new Option("\tSize of each subspace:\n" + "\t\t< 1: percentage of the number of attributes\n" + "\t\t>=1: absolute number of attributes\n", "P", 1, "-P"));

        result.addAll(Collections.list(super.listOptions()));

        return result.elements();
    }

    /**
     * Parses a given list of options.
     * <p/>
     *
     * <!-- options-start --> Valid options are:
     * <p/>
     * 
     * <pre>
     *  -P
     *  Size of each subspace:
     *   &lt; 1: percentage of the number of attributes
     *   &gt;=1: absolute number of attributes
     * </pre>
     * 
     * <pre>
     *  -S &lt;num&gt;
     *  Random number seed.
     *  (default 1)
     * </pre>
     * 
     * <pre>
     *  -I &lt;num&gt;
     *  Number of iterations.
     *  (default 10)
     * </pre>
     * 
     * <pre>
     *  -D
     *  If set, classifier is run in debug mode and
     *  may output additional info to the console
     * </pre>
     * 
     * <pre>
     *  -W
     *  Full name of base classifier.
     *  (default: weka.classifiers.trees.REPTree)
     * </pre>
     * 
     * <pre>
     *  
     * Options specific to classifier weka.classifiers.trees.REPTree:
     * </pre>
     * 
     * <pre>
     *  -M &lt;minimum number of instances&gt;
     *  Set minimum number of instances per leaf (default 2).
     * </pre>
     * 
     * <pre>
     *  -V &lt;minimum variance for split&gt;
     *  Set minimum numeric class variance proportion
     *  of train variance for split (default 1e-3).
     * </pre>
     * 
     * <pre>
     *  -N &lt;number of folds&gt;
     *  Number of folds for reduced error pruning (default 3).
     * </pre>
     * 
     * <pre>
     *  -S &lt;seed&gt;
     *  Seed for random data shuffling (default 1).
     * </pre>
     * 
     * <pre>
     *  -P
     *  No pruning.
     * </pre>
     * 
     * <pre>
     *  -L
     *  Maximum tree depth (default -1, no maximum)
     * </pre>
     * 
     * <!-- options-end -->
     *
     * Options after -- are passed to the designated classifier.
     * <p>
     *
     * @param options the list of options as an array of strings
     * @throws Exception if an option is not supported
     */
    public void setOptions(String[] options) throws Exception {
        String tmpStr;

        tmpStr = Utils.getOption('P', options);
        if (tmpStr.length() != 0)
            setSubSpaceSize(Double.parseDouble(tmpStr));
        else
            setSubSpaceSize(0.5);

        super.setOptions(options);

        Utils.checkForRemainingOptions(options);
    }

    /**
     * Gets the current settings of the Classifier.
     *
     * @return an array of strings suitable for passing to setOptions
     */
    public String[] getOptions() {
        Vector<String> result = new Vector<String>();

        result.add("-P");
        result.add("" + getSubSpaceSize());

        Collections.addAll(result, super.getOptions());

        return (String[]) result.toArray(new String[result.size()]);
    }

    /**
     * Returns the tip text for this property
     * 
     * @return tip text for this property suitable for displaying in the
     *         explorer/experimenter gui
     */
    public String subSpaceSizeTipText() {
        return "Size of each subSpace: if less than 1 as a percentage of the " + "number of attributes, otherwise the absolute number of attributes.";
    }

    /**
     * Gets the size of each subSpace, as a percentage of the training set size.
     *
     * @return the subSpace size, as a percentage.
     */
    public double getSubSpaceSize() {
        return m_SubSpaceSize;
    }

    /**
     * Sets the size of each subSpace, as a percentage of the training set size.
     *
     * @param value the subSpace size, as a percentage.
     */
    public void setSubSpaceSize(double value) {
        m_SubSpaceSize = value;
    }

    /**
     * calculates the number of attributes
     * 
     * @param total    the available number of attributes
     * @param fraction the fraction - if less than 1 it represents the percentage,
     *                 otherwise the absolute number of attributes
     * @return the number of attributes to use
     */
    protected int numberOfAttributes(int total, double fraction) {
        int k = (int) Math.round((fraction < 1.0) ? total * fraction : fraction);

        if (k > total)
            k = total;
        if (k < 1)
            k = 1;

        return k;
    }

    /**
     * generates an index string describing a random subspace, suitable for the
     * Remove filter.
     * 
     * @param indices      the attribute indices
     * @param subSpaceSize the size of the subspace
     * @param classIndex   the class index
     * @param random       the random number generator
     * @return the generated string describing the subspace
     */
    protected String randomSubSpace(Integer[] indices, int subSpaceSize, int classIndex, Random random) {
        Collections.shuffle(Arrays.asList(indices), random);
        StringBuffer sb = new StringBuffer("");
        for (int i = 0; i < subSpaceSize; i++) {
            sb.append(indices[i] + ",");
        }
        sb.append(classIndex);

        if (getDebug())
            System.out.println("subSPACE = " + sb);

        return sb.toString();
    }

    /**
     * builds the classifier.
     *
     * @param data the training data to be used for generating the classifier.
     * @throws Exception if the classifier could not be built successfully
     */
    public void buildClassifier(Instances data) throws Exception {

        // can classifier handle the data?
        getCapabilities().testWithFail(data);

        // get fresh Instances object
        m_data = new Instances(data);

        // only class? -> build ZeroR model
        if (m_data.numAttributes() == 1) {
            System.err.println("Cannot build model (only class attribute present in data!), " + "using ZeroR model instead!");
            m_ZeroR = new weka.classifiers.rules.ZeroR();
            m_ZeroR.buildClassifier(m_data);
            return;
        } else {
            m_ZeroR = null;
        }

        super.buildClassifier(data);

        Integer[] indices = new Integer[data.numAttributes() - 1];
        int classIndex = data.classIndex();
        int offset = 0;
        for (int i = 0; i < indices.length + 1; i++) {
            if (i != classIndex) {
                indices[offset++] = i + 1;
            }
        }
        int subSpaceSize = numberOfAttributes(indices.length, getSubSpaceSize());
        Random random = data.getRandomNumberGenerator(m_Seed);

        for (int j = 0; j < m_Classifiers.length; j++) {
            FilteredClassifier fc = new FilteredClassifier();
            fc.setSeed(random.nextInt());
            fc.setClassifier(m_Classifiers[j]);
            m_Classifiers[j] = fc;
            Remove rm = new Remove();
            rm.setOptions(new String[] { "-V", "-R", randomSubSpace(indices, subSpaceSize, classIndex + 1, random) });
            fc.setFilter(rm);
        }

        buildClassifiers();

        // save memory
        m_data = null;
    }

    /**
     * Returns a training set for a particular iteration.
     * 
     * @param iteration the number of the iteration for the requested training set.
     * @return the training set for the supplied iteration number
     * @throws Exception if something goes wrong when generating a training set.
     */
    protected synchronized Instances getTrainingSet(int iteration) throws Exception {

        // We don't manipulate the training data in any way. The FilteredClassifiers
        // take care of generating the sub-spaces.
        return m_data;
    }

    /**
     * Calculates the class membership probabilities for the given test instance.
     *
     * @param instance the instance to be classified
     * @return preedicted class probability distribution
     * @throws Exception if distribution can't be computed successfully
     */
    public double[] distributionForInstance(Instance instance) throws Exception {

        // default model?
        if (m_ZeroR != null) {
            return m_ZeroR.distributionForInstance(instance);
        }

        double[] sums = new double[instance.numClasses()], newProbs;

        double numPreds = 0;
        for (int i = 0; i < m_NumIterations; i++) {
            if (instance.classAttribute().isNumeric() == true) {
                double pred = m_Classifiers[i].classifyInstance(instance);
                if (!Utils.isMissingValue(pred)) {
                    sums[0] += pred;
                    numPreds++;
                }
            } else {
                newProbs = m_Classifiers[i].distributionForInstance(instance);
                for (int j = 0; j < newProbs.length; j++)
                    sums[j] += newProbs[j];
            }
        }
        if (instance.classAttribute().isNumeric() == true) {
            if (numPreds == 0) {
                sums[0] = Utils.missingValue();
            } else {
                sums[0] /= numPreds;
            }
            return sums;
        } else if (Utils.eq(Utils.sum(sums), 0)) {
            return sums;
        } else {
            Utils.normalize(sums);
            return sums;
        }
    }

    /**
     * Returns description of the bagged classifier.
     *
     * @return description of the bagged classifier as a string
     */
    public String toString() {

        // only ZeroR model?
        if (m_ZeroR != null) {
            StringBuffer buf = new StringBuffer();
            buf.append(this.getClass().getName().replaceAll(".*\\.", "") + "\n");
            buf.append(this.getClass().getName().replaceAll(".*\\.", "").replaceAll(".", "=") + "\n\n");
            buf.append("Warning: No model could be built, hence ZeroR model is used:\n\n");
            buf.append(m_ZeroR.toString());
            return buf.toString();
        }

        if (m_Classifiers == null) {
            return "RandomSubSpace: No model built yet.";
        }
        StringBuffer text = new StringBuffer();
        text.append("All the base classifiers: \n\n");
        for (int i = 0; i < m_Classifiers.length; i++)
            text.append(m_Classifiers[i].toString() + "\n\n");

        return text.toString();
    }

    /**
     * Main method for testing this class.
     *
     * @param args the options
     */
    public static void main(String[] args) {
        runClassifier(new RandomSubSpace(), args);
    }
}
